---
geometry: margin = 0.7in
output: pdf_document
---

\pagenumbering{gobble}

```{r setup, warning = FALSE, message = FALSE, echo = F}
library(tidyverse)
library(stargazer)
library(xtable)
library(RColorBrewer)
library(gridExtra)
library(Zelig)
options(xtable.comment = FALSE)
options(xtable.table.placement = "!h")
library(kableExtra)
options(dplyr.summarise.inform = FALSE)
library(scales)
library(patchwork)
library(gcookbook)
library(cowplot)

############################### DATA CLEANING #########################################

# load public-use data file - download and unzip "Feb15 Data Release" from Pew's Website
# set working directory
setwd()
# read in data
public <- foreign::read.spss("Feb15 Multiracial_cleaned.sav", to.data.frame = T, use.value.labels = F)   

# load in dataset containing new variables constructed from public-use data and write-in responses in restricted-use data
new <- read.csv("~/new_variables.csv")

# merge by CaseID
data <- left_join(public, new, by = 'CaseID')


# RECODE DEMOGRAPHIC VARIABLES
# recode gender
# attributes(merged$PPGENDER) #ref = male
data$ppgender <- as.factor(data$PPGENDER) 
data$ppgender <- factor(data$ppgender, levels = c(1,2), labels = c("Male", "Female"))

#recode region
data$region <- as.factor(data$PPREG4) # 1 = Northeast, 2 = Midwest, 3 = South 4 = West
data$region <- factor(data$region, labels = c("Northeast", "Midwest", "South", "West"))
data$region <- relevel(data$region, ref = "South") 

# recode age so that reference age/0 is equal to 18 (the youngest age possible)
data$age <- data$PPAGE - 18

# recode education
data$educ2cat <- ifelse(data$PPEDUCAT == 4, 1, 0)
data$educ2cat <- factor(data$educ2cat, levels = c(0,1), labels = c("No BA", "BA plus"))

#recode income -- take midpoint of each category, divide by 1000, and log
#midpoint for top category derived from a pareto-based formula; Hout 2004  
# 681 observations in second highest category and 1162 obs in highest category
V <- (log(681 + 1162) - log(1162))/(log(175000) - log(150000))
M <- 175000*0.5*(1 + (V/(V - 1)))
data <- data %>% mutate(income = case_when(PPINCIMP == 1 ~ 2500,
                                           PPINCIMP == 2 ~ (7499 - 5000)/2 + 5000,
                                           PPINCIMP == 3 ~ (9999 - 7500)/2 + 7500,
                                           PPINCIMP == 4 ~ (12499 - 10000)/2 + 10000,
                                           PPINCIMP == 5 ~ (14999 - 12500)/2 + 12500,
                                           PPINCIMP == 6 ~ (19999 - 15000)/2 + 15000,
                                           PPINCIMP == 7 ~ (24999 - 20000)/2 + 20000,
                                           PPINCIMP == 8 ~ (29999 - 25000)/2 + 25000,
                                           PPINCIMP == 9 ~ (34999 - 30000)/2 + 30000,
                                           PPINCIMP == 10 ~ (39999 - 35000)/2 + 35000,
                                           PPINCIMP == 11 ~ (49999 - 40000)/2 + 40000,
                                           PPINCIMP == 12 ~ (59999 - 50000)/2 + 50000,
                                           PPINCIMP == 13 ~ (74999 - 60000)/2 + 60000,
                                           PPINCIMP == 14 ~ (84999 - 75000)/2 + 75000,
                                           PPINCIMP == 15 ~ (99999 - 85000)/2 + 85000,
                                           PPINCIMP == 16 ~ (124999 - 100000)/2 + 100000,
                                           PPINCIMP == 17 ~ (149999 - 125000)/2 + 125000,
                                           PPINCIMP == 18 ~ (174999 - 150000)/2 + 150000,
                                           PPINCIMP == 19 ~ M))
# express in thousands
data$income <- data$income/1000
#logged household income
data$logged.income <- log(data$income)

# survey language
data$xspanish <- as.factor(data$XSPANISH) #ref english

###############################################################################
# Creating Ancestry Variables
#any_X = 1 if R selected X category for mom, dad, grandparents
#any_X also = 1 if R said R had ggparents or ancestors of diff race than self/mom/dad/gparents,
#and that race was X category

data <- data %>% mutate(# White ancestry
                        any_white = case_when(QR2a == 1 ~ 1,
                                              QR3a == 1 ~ 1,
                                              QR4a == 1 ~ 1,
                                              QR5 == 1 & QR5a == 1 ~ 1,
                                              TRUE ~ 0),
                        # Hispanic/Latino ancestry
                        any_latino = case_when(QR2b == 1 ~ 1,
                                               QR3b == 1 ~ 1,
                                               QR4b == 1 ~ 1,
                                               QR5 == 1 & QR5b == 1 ~ 1,
                                               TRUE ~ 0),
                        # Black ancestry
                        any_black = case_when(QR2c == 1 ~ 1,
                                              QR3c == 1 ~ 1,
                                              QR4c == 1 ~ 1,
                                              QR5 == 1 & QR5c == 1 ~ 1,
                                              TRUE ~ 0),
                        # Asian ancestry
                        any_asian = case_when(QR2d == 1 ~ 1,
                                              QR3d == 1 ~ 1,
                                              QR4d == 1 ~ 1,
                                              QR5 == 1 & QR5d == 1 ~ 1,
                                              TRUE ~ 0),
                        # American Indian ancestry
                        any_indian = case_when(QR2e == 1 ~ 1,
                                               QR3e == 1 ~ 1,
                                               QR4e == 1 ~ 1,
                                               QR5 == 1 & QR5e == 1 ~ 1,
                                               TRUE ~ 0),
                        # Native Hawaiian or Pacific Islander,
                        any_nhopi = case_when(QR2f == 1 ~ 1,
                                              QR3f == 1 ~ 1,
                                              QR4f == 1 ~ 1,
                                              QR5 == 1 & QR5f == 1 ~ 1,
                                              TRUE ~ 0),
                        # Other
                        any_other = case_when(QR2g == 1 ~ 1,
                                              QR3g == 1 ~ 1,
                                              QR4g == 1 ~ 1,
                                              QR5 == 1 & QR5g == 1 ~ 1,
                                              TRUE ~ 0))

################################################################################################
# racial ancestry regimes
data$regime <- case_when(data$any_black == 1 ~ "Black",
                         data$any_black == 0 & data$any_latino == 1 ~ "Hispanic",
                         data$any_black == 0 & data$any_latino == 0 & 
                           data$any_asian == 1 ~ "Asian",
                         data$any_black == 0 & data$any_latino == 0 &
                           data$any_asian == 0 & data$any_indian == 1 ~ "Indigenous",
                         data$any_black == 0 & data$any_latino == 0 &
                           data$any_asian == 0 & data$any_nhopi == 1 ~ "Indigenous",
                         data$any_black == 0 & data$any_latino == 0 &
                           data$any_asian == 0 & data$any_indian == 0 & data$any_nhopi == 0 ~ "Other")

## convert to factor and make "Other" the reference category
data$regime <- factor(data$regime) %>%
                  fct_relevel(., "Other", "Black", "Hispanic", "Asian", "Indigenous")

########################################################################################
# Outcome variables

################################## 
# Reports Multiracial Ancestry 

# if R did not report MOM, DAD, GPARENT, OR ANCESTOR ancestry, we can't tell presence and timing  
# of MR ancestry. In that case, R is flagged as missing (n = 95) 

data <- data %>% mutate(ancestry_incomplete = case_when(QR2a == 9 ~ 1,
                                                        QR2b == 9 ~ 1,
                                                        QR2c == 9 ~ 1,
                                                        QR2d == 9 ~ 1, 
                                                        QR2e == 9 ~ 1,
                                                        QR2f == 9 ~ 1,
                                                        QR2g == 9 ~ 1,
                                                        QR3a == 9 ~ 1, 
                                                        QR3b == 9 ~ 1,
                                                        QR3c == 9 ~ 1,
                                                        QR3d == 9 ~ 1, 
                                                        QR3e == 9 ~ 1,
                                                        QR3f == 9 ~ 1, 
                                                        QR3g == 9 ~ 1,
                                                        QR4a == 9 ~ 1, 
                                                        QR4b == 9 ~ 1,
                                                        QR4c == 9 ~ 1, 
                                                        QR4d == 9 ~ 1, 
                                                        QR4e == 9 ~ 1,
                                                        QR4f == 9 ~ 1,
                                                        QR4g == 9 ~ 1,
                                                        # if QR5a is missing, QR5b-g is also missing
                                                        QR5 == 1 & QR5a == 9 ~ 1,
                                                        TRUE ~ 0))

#### people are MR Aware if they report two or more racial categories in their ancestry
data <- data %>% 
  mutate(total_ancestry_races = any_white + any_black + any_latino + any_asian + 
                              any_indian +  any_nhopi + any_other,
         reportsMR = ifelse(total_ancestry_races >= 2, 1, 0)) 

# PLUS those who said "yes" to other ancestry despite not specifying which category (QR5 == 1)
# (aka they reported 1 racial category across all ancestors but said earlier ancestors were another race)
data$reportsMR <- ifelse(data$QR5 == 1, 1, data$reportsMR)

# remove ancestry_incomplete cases from reportsMR coding
data$reportsMR <- ifelse(data$ancestry_incomplete == 1, NA, data$reportsMR)

##########################################################
### Multiracial Self-ID (Selects 2 or more races for self)

# recode Refused to Missing
# (if QR1a == 9 for a respondent, then QR1b-g is also 9; R refused to answer QR1)
data[data$QR1a == 9, "QR1a"] <- NA
data[data$QR1b == 9, "QR1b"] <- NA
data[data$QR1c == 9, "QR1c"] <- NA
data[data$QR1d == 9, "QR1d"] <- NA
data[data$QR1e == 9, "QR1e"] <- NA
data[data$QR1f == 9, "QR1f"] <- NA
data[data$QR1g == 9, "QR1g"] <- NA

# sum up the number of races reported
data <- data %>% mutate(num_races = QR1a + QR1b + QR1c + QR1d + QR1e + QR1f + QR1g,
# MR Self- ID: 1 if respondent reports 2 or more racial categories for self, 0 if not
                        MRid = ifelse(num_races >= 2, 1, 0)) 

### recoding write-in responses
# R's who self-identify as "Some Other Race" may fall into the following scenarios:
# 1) If they self-id as SOR along with TWO OR MORE other categories (QR1g == 1 & num_races >= 3) they are coded as MR-identifying

# 2) If they self-id as SOR along with ONE other category (QR1g == 1 & num_races == 2), then there are two possibilities:
#    A) The write-in response was irrelevant or redundant with the other category they selected 
#       (e.g. write-in was "German" when they checked "White" and "Some Other Race"), so the "race_R" variable was recoded 
#       to a single race (one character long string).
#    B) The write-in response indicates that the respondent was expressing multiracial identity. In 
#       these cases, the "race_R" variable would be recoded to indicate 2+ races/origins and would be 2+ characters long.

# 3) If they self-id as SOR ONLY (QR1g == 1 & num_races == 1), then there are two possibilities:
#    A) The write-in response did NOT indicate multiracial self-identification. In these cases, 
#       the "race_R" variable would remain blank.
#    B) The write-in response DID indicate multiracial self-identification. In these cases, the "race_R"
#       variable would be recoded to a 2+ character string indexing the categories their write-in response mentioned. 


# Cases that belong to 1) do not need to be recoded because they are counted as multiracially self-identifying anyway.
# Cases that belong to 2A) NEED TO BE CODED BACK TO MONORACIAL because would be counted as multiracial self-id in our coding scheme, even though manual review indicates that they are not expressing multiracial identity.
# Cases that belong to 2B) do not need to be recoded because they are counted as multiracial self-id in our coding scheme and manual review agrees.
# Cases that belong to 3A) do not need to be recoded because they are counted as monoracial self-id in our coding scheme and manual review agrees.
# Cases that belong to 3B) NEED TO BE CODED TO MULTIRACIAL because they are counted as monoracial in our codings scheme, but manual review indicates that they are expressing multiracial identity.

# Cases that should be recoded back to monoracial (2A)
monoracial_cases <- unname(unlist(data %>% filter(num_races == 2 & QR1g == 1) %>% 
  mutate(races = nchar(race_R)) %>%
  filter(races == 1) %>%
  select(CaseID))) 
data$MRid <- ifelse(data$CaseID %in% monoracial_cases, 0, data$MRid)

# Cases that should be recoded to multiracial (3B)
multiracial_other_cases <- unname(unlist(data %>% filter(num_races == 1 & QR1g == 1) %>% 
  mutate(races = nchar(race_R)) %>%
  filter(races >= 1) %>%
  select(CaseID)))    
data$MRid <- ifelse(data$CaseID %in% multiracial_other_cases, 1, data$MRid)


# relabel levels
data$MRid <- factor(data$MRid, levels = c(0,1), labels = c("Selects 1 race", "Selects 2+ races"))

########### Combine MR awareness and identification to one variable (useful for some descriptives)
data <- data %>% mutate(AwareID = as.factor(case_when(reportsMR == 0 ~ "Monoracial Ancestry",
                                                      reportsMR == 1 & MRid == "Selects 1 race" ~ 
                                                        "Multiracial Ancestry;\nMonoracial Self-identification",
                                                      TRUE ~ "Multiracial Ancestry and\nSelf-identification")))
data$AwareID = factor(data$AwareID, 
                      levels = c("Monoracial Ancestry",
                                 "Multiracial Ancestry;\nMonoracial Self-identification",
                                 "Multiracial Ancestry and\nSelf-identification"))

#########################################################################################################
# recode generation (following previous RA's approach but counting Some Other Race as category)

# create dummy for having Some Other Race in ancestry and 
# recode race_mom, race_dad, race_grand, and race_great variables so that "O" (for "Other") is added 

# note: if the respondent didn't select QR#a-f before (e.g. only SOR was selected),
# race_mom/dad/grand/great will be blank
data <- data %>% mutate(QR2gz = ifelse(QR2g == 1, "O", NA),
               QR3gz = ifelse(QR3g == 1, "O", NA),
               QR4gz = ifelse(QR4g == 1, "O", NA),
               QR5gz = ifelse(QR5g == 1, "O", NA)) %>%
  mutate(new_race_mom = ifelse(is.na(QR2gz), race_mom, paste(race_mom, QR2gz, sep = "")),
         new_race_dad = ifelse(is.na(QR3gz), race_dad ,paste(race_dad, QR3gz, sep = "")),
         new_race_grand = ifelse(is.na(QR4gz), race_grand, paste(race_grand, QR4gz, sep = "")),
         new_race_great = ifelse(is.na(QR5gz), race_great, paste(race_great, QR5gz, sep = ""))) 

# generation dummy variables
data <- data %>%
########## identify monoracial (0 generations)
# if one racial category is reported in ancestry AND does NOT answer "yes" to QR5 
# (same results as matching string of race_mom, race_dad, etc.)
  mutate(gen0 = ifelse(total_ancestry_races == 1 & QR5 != 1, 1, 0),
########## identify 1st gen
# if both parents are single race AND those races are different AND QR5 != 1
         gen1 = ifelse(new_race_mom != new_race_dad &
                         nchar(new_race_mom) == 1 &
                         nchar(new_race_dad) == 1 &
                         QR5 != 1, 1, 0))
# but these include cases where grandparents have a value that aren't in parents 
grand_notinpar_cases <- data %>%
  filter(gen1 == 1) %>%
  # put space between characters in grandparent race string
  mutate(race_grand_space = trimws(gsub('(.{1})', '\\1 ',new_race_grand))) %>%
  # separate rows and match, flag == 1 if there is a race in grandparents that's not in mom or dad
  separate_rows(race_grand_space, sep = " ") %>%
  mutate(grand_new_flag = ifelse(race_grand_space == new_race_mom | race_grand_space == new_race_dad,
                              0, 1)) %>%
  # save CaseIDs of cases with grandparent new races
  group_by(CaseID) %>%
  summarise(grand_new_races = sum(grand_new_flag)) %>%
  filter(grand_new_races >= 1) %>%
  select(CaseID)
# recode these cases as NOT first gen
data$gen1 <- ifelse(data$CaseID %in% unlist(grand_notinpar_cases), 0, data$gen1)
  
# create generation variable - everyone who is not gen 0 or gen 1 is 2
data <- data %>% mutate(gens2cat = case_when(gen0 == 1 ~ 0,
                                             gen1 == 1 ~ 1,
                                             TRUE ~ 2))
# unless you have incomplete ancestry info, then you are missing
data$gens2cat <- ifelse(data$ancestry_incomplete, NA, data$gens2cat)
```

```{r, echo = F}
#### CREATE ANALYTIC SAMPLES

########### Drop cases with incomplete ancestry OR incomplete self-ID
data <- filter(data, ancestry_incomplete == 0 & !is.na(MRid))

########### create MR only dataset 
MR <- filter(data, reportsMR == 1)

# turn generation variable into factor
MR$gens2cat <- as.factor(MR$gens2cat)
```



```{r, echo = F, message = F, warning = F, fig.show ='hold',fig.align='center', fig.height= 7, fig.width= 7}
########################################################### FIGURES ###############################################################

# Figure 1: Self-ID by gender

# collapse AwareID into two categories -- Multiracial and Monoracial Self-ID
## Note: the rare (0.1%) cases that do not report MR ancestry but self-id with 2+ races are 
## counted as monoracial Self-ID both here and in AwareID variable
### Because in our theoretical model, you cannot self-id with more than one race if you are not 
## aware of having multiracial ancestry
data$AwareID_2cat = as.factor(ifelse(data$reportsMR == 1 & data$MRid == "Selects 2+ races",  
                                 "Multiracial Self-identification", 
                                 "Monoracial Self-identification"))

# left panel: table - conventional view
just_id_table <- data %>% 
  group_by(AwareID_2cat, ppgender) %>%
  summarize(Freq = n()) %>%
  rename(Gender = ppgender) %>%
  group_by(Gender) %>%
  mutate(PercentLabel = paste(as.character(format(round(100 * Freq/sum(Freq), 1), nsmall = 1)),
                              "%", sep = "")) 
just_id_table$Gender <- relevel(just_id_table$Gender, ref = "Female")

#set colors
colors_0 <- c(brewer.pal(3, "Blues")[3], brewer.pal(3, "Blues")[1])

#plot
fig1a <- ggplot(just_id_table, aes(x = Gender, y = Freq, 
                                  fill = forcats::fct_rev(AwareID_2cat))) + 
  geom_bar(stat = "identity", position = "stack") +
  labs(y = "Count", x = "",
       fill = "") +
  scale_fill_manual(values = colors_0) +
  theme_minimal() +
  geom_text(aes(x = Gender, y = Freq, label = PercentLabel, 
                group = forcats::fct_rev(AwareID_2cat)),
            position = position_stack(vjust = .5), size = 3) +
  theme(axis.text.x = element_text(size = 8, face = "bold"),
        axis.title.y = element_text(size = 10, face = "bold"),
        legend.text = element_text(size = 8),
        legend.position = "bottom") +
  guides(fill = guide_legend(nrow = 2)) +
  scale_y_continuous(label = comma, limits = c(0, 12700), breaks = c(2500, 5000, 7500, 10000, 12500)) +
  theme(legend.key.height=unit(1, "cm")) +
  theme(panel.grid.major.x = element_blank())
  

############################################################################################
# table - with ancestry/self-id distinction
aware_id_table <- data %>% 
  group_by(AwareID, ppgender) %>%
  summarize(Freq = n()) %>%
  rename(Gender = ppgender) %>%
  group_by(Gender) %>%
  mutate(PercentLabel = paste(as.character(format(round(100 * Freq/sum(Freq), 1), nsmall = 1)),
                              "%", sep = "")) 
aware_id_table$Gender <- relevel(aware_id_table$Gender, ref = "Female")

#set colors
colors1 <- c(brewer.pal(3, "Blues")[3], brewer.pal(3, "Blues")[2], brewer.pal(3, "Blues")[1])

#labels of statistical significance
#chisq.test(data$ppgender, data$AwareID) #p = 2.584e-10
#chisq.test(data$ppgender, data$AwareID_2cat) #p = 0.4854
labels.df <- data.frame(Gender = "Male" ,
                        Freq = 11500,
                        AwareID = "Monoracial Ancestry",
                        lab = "***")

#plot
fig1b <- ggplot(aware_id_table, aes(x = Gender, y = Freq, 
                                   fill = forcats::fct_rev(AwareID))) + 
  geom_bar(stat = "identity", position = "stack",
           aes(color = forcats::fct_rev(AwareID))) +
  labs(y = "Count", x = "",
       fill = "") +
  scale_fill_manual(values = colors1) +
  scale_color_manual(values = c('black', 'black', NA), guide = F) +
  theme_minimal() +
  geom_text(aes(x = Gender, y = Freq, label = PercentLabel, 
                group = fct_rev(AwareID)),
            position = position_stack(vjust = 0.5), size = 3) +
  geom_text(data = labels.df, label = "***", hjust = 3.2, vjust = -1.1, size = 5) +
  theme(axis.text.x = element_text(size = 8, face = "bold"),
        axis.title.y = element_text(size = 10, face = "bold"),
        legend.text = element_text(size = 8),
        legend.position = "bottom") +
  guides(fill = guide_legend(nrow = 3)) +
  scale_y_continuous(label = comma, limits = c(0, 12700), breaks = c(2500, 5000, 7500, 10000, 12500)) +
  theme(legend.key.height=unit(1, "cm")) +
  theme(panel.grid.major.x = element_blank())

#combine
fig1 <- cowplot::plot_grid(fig1a, fig1b, ncol = 2, align = "h")

fig1

```

\newpage

```{r, echo = F, message = F, warning = F, fig.show ='hold',fig.align='center', fig.height = 7, fig.width = 7}
# Figure 2: % Self-id as proportion of aware (total and by generation)

# table MRid x gender - all generations
id_table <- MR %>% 
  group_by(MRid, ppgender) %>%
  summarize(Freq = n()) %>%
  rename(Gender = ppgender) %>%
  group_by(Gender) %>%
  mutate(PercentLabel = paste(as.character(format(round(100 * Freq/sum(Freq), 1), nsmall = 1)),
                              "%", sep = ""),
         Generation = c("All Generations")) %>%
  select(Gender, Generation, MRid, Freq, PercentLabel)
id_table$Gender <- relevel(id_table$Gender, ref = "Female")

# table MRid x gender - by generation
id_gen_table <- MR %>%
  mutate(gens2cat = factor(gens2cat, labels = c("First Generation", "Second Generation or Higher"))) %>%
  group_by(ppgender, gens2cat, MRid) %>%
  summarise(Freq = n()) %>%
  rename(Gender = ppgender, Generation = gens2cat) %>%
  group_by(Gender, Generation) %>%
  mutate(PercentLabel = paste(as.character(format(round(100 * Freq/sum(Freq), 1), nsmall = 1)), 
                              "%", sep = ""))
id_gen_table$Gender <- relevel(id_gen_table$Gender, ref = "Female")

# combine tables
id_all_table <- bind_rows(id_table, id_gen_table)
id_all_table$Generation <- factor(id_all_table$Generation,
                                     levels = c("All Generations",
                                                "First Generation",
                                                "Second Generation or Higher"))

#set colors
colors2 <- c(brewer.pal(3, "Blues")[3], brewer.pal(3, "Blues")[2])

################################################ Statistical significance and labels
#gen1 <- filter(MR, gens2cat == 1)
#gen2 <- filter(MR, gens2cat == 2)
#chisq.test(MR$ppgender, MR$MRid) #p = 0.01619 *
#chisq.test(gen1$ppgender, gen1$MRid) #p = 0.05233
#chisq.test(gen2$ppgender, gen2$MRid) #p = 0.000549 ***
#labels 
label1.df <- data.frame(Gender = "Male" ,
                        Freq = 2500,
                        MRid = "Selects 1 race",
                        Generation = "All Generations",
                        label = "*")
#label2.df = data.frame(Gender = "Male",
#                       Freq = 450,
#                       MRid = "Selects 1 race",
#                       Generation = "First Generation",
#                       label = "*")
label3.df = data.frame(Gender = "Male",
                       Freq = 2200,
                       MRid = "Selects 1 race",
                       Generation = "Second Generation or Higher",
                       label = "***")

label1.df$Generation <- factor(label1.df$Generation,
                               levels = c("All Generations",
                                                "First Generation",
                                                "Second Generation or Higher"))

#label2.df$Generation <- factor(label2.df$Generation,
#                               levels = c("All Generations",
#                                                "First Generation",
#                                               "Second Generation or Higher"))

label3.df$Generation <- factor(label3.df$Generation,
                               levels = c("All Generations",
                                                "First Generation",
                                                "Second Generation or Higher"))

########################################### PLOT ##################################

fig2 <- ggplot(id_all_table, aes(x = Gender, y = Freq, fill = fct_rev(MRid))) + 
  geom_bar(stat = "identity", position = "stack") +
  labs(y = "Count", x = "\nGender and Multiracial Generation",
       fill = "") +
  scale_fill_manual(values = colors2) +
  scale_y_continuous(label = comma, 
                     limits = c(0,2600),
                     breaks = c(500, 1000, 1500, 2000, 2500)) +
  theme_minimal() +
  geom_text(aes(x = Gender, y = Freq, label = PercentLabel, 
                group = fct_rev(MRid)),
        position = position_stack(vjust = 0.5), size = 3) +
  facet_wrap(.~Generation, strip.position = "bottom") +
  theme(strip.placement = "outside",
        strip.text.x = element_text(size=8, face = "bold")) +
  geom_text(data = label1.df, label = "*", hjust = 6.6, size = 5) +
  #geom_text(data = label2.df, label = "*", hjust = 6.6, size = 5) +
  geom_text(data = label3.df, label = "***", hjust = 2.5, size = 5) +
  theme(axis.text.x = element_text(size = 8, face = "bold", margin = margin(t = -10)),
        axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10, face = "bold"),
        axis.title.x = element_text(size = 10, face = "bold"),
        legend.position = c(0.99, 1.02),
        legend.justification = c("right", "top"),
        legend.box.just = c("right", "top"),
        legend.margin = margin(0, 0, 0, 0),
        legend.text=element_text(size=8),
        legend.key.height = unit(0.5, "cm")) +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.spacing.x = unit(-0.1, "lines"))

fig2

```


\newpage

```{r, echo = F, message = F, warning = F, fig.show ='hold',fig.align='center', fig.height = 5.8, fig.width = 7.5}
# Figure 3:
# Predicted Probablities for Self-ID: id_gender_gen_fit
# first difference between men and women by generation and regime 
id_out <- zelig(MRid ~ ppgender*gens2cat + regime + age + region +
                           educ2cat + logged.income + xspanish, 
                   model = "logit",
                   data = MR,
                   cite = FALSE)

# set x's 
x_out_BlackF1 <- setx(id_out,
                    ppgender = "Female",
                    gens2cat = "1",
                    regime = "Black",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_BlackF2 <- setx(id_out,
                    ppgender = "Female",
                    gens2cat = "2",
                    regime = "Black",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_BlackM1 <- setx(id_out,
                    ppgender = "Male",
                    gens2cat = "1",
                    regime = "Black",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_BlackM2 <- setx(id_out,
                    ppgender = "Male",
                    gens2cat = "2",
                    regime = "Black",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_LatinxF1 <- setx(id_out,
                    ppgender = "Female",
                    gens2cat = "1",
                    regime = "Hispanic",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_LatinxF2 <- setx(id_out,
                    ppgender = "Female",
                    gens2cat = "2",
                    regime = "Hispanic",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_LatinxM1 <- setx(id_out,
                    ppgender = "Male",
                    gens2cat = "1",
                    regime = "Hispanic",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_LatinxM2 <- setx(id_out,
                    ppgender = "Male",
                    gens2cat = "2",
                    regime = "Hispanic",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_AsianF1 <- setx(id_out,
                    ppgender = "Female",
                    gens2cat = "1",
                    regime = "Asian",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_AsianF2 <- setx(id_out,
                    ppgender = "Female",
                    gens2cat = "2",
                    regime = "Asian",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_AsianM1 <- setx(id_out,
                    ppgender = "Male",
                    gens2cat = "1",
                    regime = "Asian",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_AsianM2 <- setx(id_out,
                    ppgender = "Male",
                    gens2cat = "2",
                    regime = "Asian",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_IndianF1 <- setx(id_out,
                    ppgender = "Female",
                    gens2cat = "1",
                    regime = "Indigenous",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_IndianF2 <- setx(id_out,
                    ppgender = "Female",
                    gens2cat = "2",
                    regime = "Indigenous",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_IndianM1 <- setx(id_out,
                    ppgender = "Male",
                    gens2cat = "1",
                    regime = "Indigenous",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

x_out_IndianM2 <- setx(id_out,
                    ppgender = "Male",
                    gens2cat = "2",
                    regime = "Indigenous",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = 1)

# simulation output - save 
set.seed(08540)

# write function to save mean and 95% coverage bounds of FIRST DIFFERENCES
# Female EV - Male EV
save_95fds <- function(model, x_valuesM, x_valuesF){
  # simulate 10000 times 
  simulations <- sim(model, x = x_valuesM, x1 = x_valuesF, num = 10000)
  # save the 10000 expected values
  fds <- unlist(simulations[["sim.out"]][["x1"]][["fd"]])
  # save the mean
  fd_mean <- mean(fds)
  # save the lower bound - the 250th lowest value of the string of expected values
  fd_lower <- sort(fds)[250]
  # save the upper bound - the 9750th lowest value of the string of evs
  fd_upper <- sort(fds)[9750]
  # store as vector
  output <- c(fd_mean, fd_lower, fd_upper)
  return(output)
}

# run function on all 10 regime x generation combos, save as dataframe
id_fds <- as.data.frame(rbind(
                       save_95fds(id_out, x_out_BlackM1, x_out_BlackF1),
                       save_95fds(id_out, x_out_BlackM2, x_out_BlackF2),
                       save_95fds(id_out, x_out_LatinxM1, x_out_LatinxF1),
                       save_95fds(id_out, x_out_LatinxM2, x_out_LatinxF2),
                       save_95fds(id_out, x_out_AsianM1, x_out_AsianF1),
                       save_95fds(id_out, x_out_AsianM2, x_out_AsianF2),
                       save_95fds(id_out, x_out_IndianM1, x_out_IndianF1),
                       save_95fds(id_out, x_out_IndianM2, x_out_IndianF2)))
# rename col names
colnames(id_fds) <- c("Mean", "Lower", "Upper")
# append race/gen combos
id_fds$Regime <- c(rep("Black", 2), 
                   rep("Hispanic", 2), 
                   rep("Asian", 2),
                   rep("Indigenous", 2))
# levels are backwards bc ggplot will be mirrored
id_fds$Regime <- factor(id_fds$Regime,
                        levels = c("Indigenous", "Asian",
                                    "Hispanic", "Black")) 
id_fds$Generation <- as.factor(rep(c("First Generation", "Second Generation\nor Higher"), 4))

# remove leading zeros
dropLeadingZero <- function(l){
  str_replace(l, '0(?=.)', '')
}

# plot
fig3 <- ggplot(id_fds, aes(x = Regime, y = Mean, 
                   group = interaction(Regime, fct_rev(Generation)), 
                   color = Regime)) +
  geom_point(position = position_dodge(width = 0.3), 
             size = 2, 
             aes(shape = factor(Regime)),
             show.legend = FALSE) +
  geom_linerange(aes(ymin = Lower, ymax = Upper,
                     linetype = fct_rev(Generation)), 
                position = position_dodge(width = 0.3)) +
  labs(x = "", y = "\nDifference in Expected Probabilities (Female - Male)",
       linetype = "",
       color = "") +
  scale_y_continuous(labels = dropLeadingZero,
                     limits = c(-.15, .22),
                     breaks = c(-.15, -.1, -.05, 0, .05, .1, .15, .2)) +
  scale_color_manual(values = rep(c(brewer.pal(3, "Blues")[3],
                                brewer.pal(3, "Reds")[3],
                                brewer.pal(3, "Purples")[3],
                                brewer.pal(3, "Greens")[3]), 2),
                     guide = FALSE) +
  scale_linetype_manual(values = c("solid", "dashed"),
                        guide = guide_legend(reverse=TRUE)) +
  theme_minimal() +
  coord_flip() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme(legend.text = element_text(size = 8),
        axis.text.x = element_text(size = 8, face = "bold"),
        axis.text.y = element_text(size = 8, face = "bold", margin = margin(t = -13)),
        axis.title.x = element_text(size = 10, face = "bold"),
        legend.position = c(0.99, 1.02),
        legend.direction = "vertical",
        legend.justification = c("right", "top"),
        legend.box.just = c("right", "top"),
        #legend.margin = margin(l = -20, r = 0, t = -10, b= 0),
        legend.margin = margin(0, 0, 0, 0),
        legend.key.height = unit(0.5, "cm"),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.x = element_blank()) 
  

# convert lines in legend to horizontal
fig3_grob <- ggplotGrob(fig3)
library(grid)
fig3_grob <- editGrob(grid.force(fig3_grob), gPath("key-[3,4]-1-[1, bg]"),
                      grep = TRUE, global = TRUE,
                      x0 = unit(0, "npc"), y0 = unit(0.5, "npc"), 
                      x1 = unit(1, "npc"), y1 = unit(0.5, "npc"))

grid.draw(fig3_grob)
dev.off()

fig3_grob

```

\newpage


```{r, echo = F, message = F, warning = F, fig.show ='hold',fig.align='center', fig.width = 8, fig.height = 8}
# Figure 4: 
# Expected probabilities of reporting MR-ID from id_regimgen_gender_fit model 
# 16 combos: 2 gender x 2 gen x 4 race

# specify model
id_out_2 <- zelig(MRid ~ ppgender*gens2cat*regime + age + region + educ2cat + logged.income + xspanish,
                   model = "logit",
                   data = MR,
                   cite = FALSE)

# set x's 
x_out_BlackF1_2 <- setx(id_out_2,
                    ppgender = "Female",
                    gens2cat = "1",
                    regime = "Black",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_BlackF2_2 <- setx(id_out_2,
                    ppgender = "Female",
                    gens2cat = "2",
                    regime = "Black",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_BlackM1_2 <- setx(id_out_2,
                    ppgender = "Male",
                    gens2cat = "1",
                    regime = "Black",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_BlackM2_2 <- setx(id_out_2,
                    ppgender = "Male",
                    gens2cat = "2",
                    regime = "Black",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_LatinxF1_2 <- setx(id_out_2,
                    ppgender = "Female",
                    gens2cat = "1",
                    regime = "Hispanic",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_LatinxF2_2 <- setx(id_out_2,
                    ppgender = "Female",
                    gens2cat = "2",
                    regime = "Hispanic",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_LatinxM1_2 <- setx(id_out_2,
                    ppgender = "Male",
                    gens2cat = "1",
                    regime = "Hispanic",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_LatinxM2_2 <- setx(id_out_2,
                    ppgender = "Male",
                    gens2cat = "2",
                    regime = "Hispanic",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_AsianF1_2 <- setx(id_out_2,
                    ppgender = "Female",
                    gens2cat = "1",
                    regime = "Asian",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_AsianF2_2 <- setx(id_out_2,
                    ppgender = "Female",
                    gens2cat = "2",
                    regime = "Asian",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_AsianM1_2 <- setx(id_out_2,
                    ppgender = "Male",
                    gens2cat = "1",
                    regime = "Asian",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_AsianM2_2 <- setx(id_out_2,
                    ppgender = "Male",
                    gens2cat = "2",
                    regime = "Asian",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_IndianF1_2 <- setx(id_out_2,
                    ppgender = "Female",
                    gens2cat = "1",
                    regime = "Indigenous",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_IndianF2_2 <- setx(id_out_2,
                    ppgender = "Female",
                    gens2cat = "2",
                    regime = "Indigenous",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_IndianM1_2 <- setx(id_out_2,
                    ppgender = "Male",
                    gens2cat = "1",
                    regime = "Indigenous",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")
x_out_IndianM2_2 <- setx(id_out_2,
                    ppgender = "Male",
                    gens2cat = "2",
                    regime = "Indigenous",
                    age = median(MR$age),
                    region = "South",
                    educ2cat = "No BA",
                    logged.income = median(MR$logged.income),
                    xspanish = "1")

# write function to save mean and 95% coverage bounds
set.seed(08540)
save_95evs <- function(model, x_values){
  # simulate 10000 times 
  simulations <- sim(model, x = x_values, num = 10000)
  # save the 10000 expected values
  evs <- unlist(simulations[["sim.out"]][["x"]][["ev"]])
  # save the mean
  ev_mean <- mean(evs)
  # save the lower bound - the 250th lowest value of the string of expected values
  ev_lower <- sort(evs)[250]
  # save the upper bound - the 9750th lowest value of the string of evs
  ev_upper <- sort(evs)[9750]
  # store as vector
  output <- c(ev_mean, ev_lower, ev_upper)
  return(output)
}

# run function on all 16 combos, save as dataframe
id_evs <- as.data.frame(rbind(
                       save_95evs(id_out_2, x_out_BlackF1_2),
                       save_95evs(id_out_2, x_out_BlackF2_2),
                       save_95evs(id_out_2, x_out_BlackM1_2),
                       save_95evs(id_out_2, x_out_BlackM2_2),
                       save_95evs(id_out_2, x_out_LatinxF1_2),
                       save_95evs(id_out_2, x_out_LatinxF2_2),
                       save_95evs(id_out_2, x_out_LatinxM1_2),
                       save_95evs(id_out_2, x_out_LatinxM2_2),
                       save_95evs(id_out_2, x_out_AsianF1_2),
                       save_95evs(id_out_2, x_out_AsianF2_2),
                       save_95evs(id_out_2, x_out_AsianM1_2),
                       save_95evs(id_out_2, x_out_AsianM2_2),
                       save_95evs(id_out_2, x_out_IndianF1_2),
                       save_95evs(id_out_2, x_out_IndianF2_2),
                       save_95evs(id_out_2, x_out_IndianM1_2),
                       save_95evs(id_out_2, x_out_IndianM2_2)))
# rename col names
colnames(id_evs) <- c("Mean", "Lower", "Upper")
# append race/gender combos
id_evs$regime <- c(rep("Black", 4), rep("Hispanic", 4), 
                      rep("Asian", 4), rep("Indigenous", 4))
id_evs$regime <- factor(id_evs$regime,
                           levels = c("Black", "Hispanic", "Asian", "Indigenous"))
id_evs$gender <- rep(c("Female", "Female", "Male", "Male"), 4)
id_evs$gender <- factor(id_evs$gender, levels = c("Female", "Male"))
id_evs$gen <- rep(c("First Generation", "Second Generation or Higher"), 8)
id_evs$gen <- factor(id_evs$gen, levels = c("First Generation", "Second Generation or Higher"))

# plot
fig4 <- ggplot(id_evs, aes(x = str_wrap(gen, 10), y = Mean, fill = gender)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_errorbar(aes(ymin = Lower, ymax = Upper), 
                position = position_dodge(width = 0.9), width = 0.2) +
  theme_minimal() +
  facet_grid(. ~regime, scales = "free_x", switch = "x") +
  theme(strip.placement = "outside",
        strip.text.x = element_text(size=8, face = "bold")) +
  labs(y = "Expected Probability", x = "\nMultiracial Generation and Racial Classification Regime",
       fill = "") +
  scale_y_continuous(labels = dropLeadingZero) +
  theme(axis.text.x = element_text(size = 8, face = "bold", margin = margin(t = -10)),
        axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10, face = "bold"),
        axis.title.x = element_text(size = 10, face = "bold"),
        legend.position = c(0.99, 1.02),
        legend.justification = c("right", "top"),
        legend.box.just = c("right", "top"),
        legend.margin = margin(0, 0, 0, 0),
        legend.text=element_text(size=8),
        legend.key.height = unit(0.5, "cm")) +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.spacing.x = unit(-0.1, "lines"))

fig4

```

